# --- kNN with a held-out validation set (history fragment) -----------------
# NOTE(review): this chunk appears out of order in the saved history --
# 'train.data', 'training.set' and 'validate.set' are created further down
# in this file; as written, the lines below only run after those objects
# exist in the workspace.
#install.packages("class")
library(class)
# number of neighbours for the kNN classifier
k <- 7
# labels for the rows that went into the training split
# (assumes training.set holds the first nrow(training.set) rows of train.data)
train.labels <- train.data$label[1:nrow(training.set)]
# knn() breaks ties randomly, so fix the seed for reproducibility
set.seed(2612)
knn.fit <- knn(train = training.set,
test = validate.set, # note the use of validation set
cl = train.labels,
k = k)
# labels of the remaining rows, i.e. the validation split
validate.labels <- train.data$label[(nrow(training.set)+1):nrow(train.data)]
# confusion matrix: predictions vs. actual validation labels
conf.mat <- table(Predictions = knn.fit,
Actual = validate.labels)
conf.mat
# compute.eval.measures() comes from text_mining_utils.R (sourced below)
compute.eval.measures(conf.mat)
# 10-fold cross-validation setup and candidate (odd) values for k
numFolds = trainControl( method = "cv", number = 10 )
kGrid = expand.grid(.k = seq(from = 3, to = 25, by = 2))
# --- Build the training and test data frames for spam classification ------
getwd()
# provides read.text(), print.tweets() and compute.eval.measures()
source("text_mining_utils.R")

# Collect the paths of the training e-mails, one folder per class.
spam.train.files <- list.files(path = "data/emails/spam-train", full.names = TRUE)
#spam.train.files[1:10]
nonspam.train.files <- list.files(path = "data/emails/nonspam-train", full.names = TRUE)
# nonspam.train.files[1:10]

# One row per e-mail: file path, class label, and text (filled in below).
train.data <- data.frame(fpath = c(spam.train.files, nonspam.train.files),
                         label = c(rep("spam", times = length(spam.train.files)),
                                   rep("nonspam", times = length(nonspam.train.files))),
                         text = NA, stringsAsFactors = FALSE)
str(train.data)

# Read each e-mail's content into the 'text' column.
# seq_len() is safe if a folder is empty (1:nrow() would iterate over c(1, 0)).
for (i in seq_len(nrow(train.data))) {
  train.data$text[i] <- read.text(train.data$fpath[i])
}
head(train.data)

train.data$label <- as.factor(train.data$label)
summary(train.data$label)

# Shuffle the training rows so spam and non-spam e-mails are interleaved:
# draw one uniform random number per row and order the rows by it.
set.seed(2612)
randomized <- runif(n = nrow(train.data))
train.data <- train.data[order(randomized), ]
# check that the data rows have really been shuffled
train.data[1:10, 2]

# Same procedure for the test set.
spam.test.files <- list.files(path = "data/emails/spam-test", full.names = TRUE)
nonspam.test.files <- list.files(path = "data/emails/nonspam-test", full.names = TRUE)
test.data <- data.frame(fpath = c(spam.test.files, nonspam.test.files),
                        label = c(rep("spam", times = length(spam.test.files)),
                                  rep("nonspam", times = length(nonspam.test.files))),
                        text = NA, stringsAsFactors = FALSE)
for (i in seq_len(nrow(test.data))) {
  test.data$text[i] <- read.text(test.data$fpath[i])
}
test.data$label <- as.factor(test.data$label)
head(test.data)

# Stack train and test so one document-term matrix covers both.
all.data <- rbind(train.data, test.data)
dim(all.data)
# --- Build a term-frequency document-term matrix and split it --------------
library(tm) # functions for working with a corpus live in the tm package
corpus <- Corpus(VectorSource(all.data$text))
# keep words of length >= 2; default term-frequency weighting
dtm <- DocumentTermMatrix(x = corpus,
                          control = list(wordLengths = c(2, Inf)))
inspect(dtm)

# Drop terms absent from more than 97.5% of documents to reduce sparsity.
dtm.reduced <- removeSparseTerms(dtm, sparse = 0.975)
dtm.reduced
dtm.final <- as.matrix(dtm.reduced)
dim(dtm.final)

# Split the rows back into training / test feature matrices.
# The parentheses around (nrow(train.data)+1) are essential: ':' binds
# tighter than '+', so nrow(train.data)+1:nrow(dtm.final) would compute
# nrow(train.data) + (1:nrow(dtm.final)) and select the wrong rows
# (the original history contained exactly that buggy form before fixing it).
training <- dtm.final[1:nrow(train.data), ]
testing <- dtm.final[(nrow(train.data)+1):nrow(dtm.final), ]
# --- Fit a kNN classifier (k = 7) on the TF features and evaluate it -------
k <- 7
train.labels <- train.data$label

# knn() breaks ties between neighbours randomly; fix the seed so the
# predictions are reproducible.
set.seed(2612)
knn.fit <- knn(k = k,
               cl = train.labels,
               train = training,
               test = testing)

# Compare predictions against the true labels of the test documents.
test.labels <- test.data$label
conf.mat <- table(Predictions = knn.fit, Actual = test.labels)
conf.mat
compute.eval.measures(conf.mat)

# Cross-validation setup: 10 folds and a grid of odd k values from 3 to 25.
numFolds <- trainControl(method = "cv", number = 10)
kGrid <- expand.grid(.k = seq(from = 3, to = 25, by = 2))
# --- Tune k with 10-fold cross-validation via caret::train() ---------------
# NOTE: the original history first called train() with the non-existent
# object 'cpGrid' (a typo for 'kGrid'), which errors out; only the
# corrected call is kept here.
set.seed(2612)
knn.cv <- train(x = training,
                y = train.labels,
                method = "knn",
                trControl = numFolds,
                tuneGrid = kGrid)
knn.cv
plot(knn.cv)

# Refit with the CV-chosen neighbourhood size (k = 9) and evaluate.
# (The original history ran this identical knn() call twice in a row;
# one run is sufficient.)
knn.fit2 <- knn(train = training,
                test = testing,
                cl = train.labels,
                k = 9)
conf.mat2 <- table(Predicted = knn.fit2, Actual = test.labels)
conf.mat2
# --- Re-fit kNN with k = 5 and evaluate on the test set --------------------
k <- 5
train.labels <- train.data$label
set.seed(2612)
knn.fit <- knn(train = training,
               test = testing,
               cl = train.labels,
               k = k)
test.labels <- test.data$label
conf.mat <- table(Predictions = knn.fit,
                  Actual = test.labels)
conf.mat
compute.eval.measures(conf.mat)

# Evaluate the earlier k = 9 model.
# NOTE: the original history re-ran the k = 9 knn() fit at this point but
# never rebuilt conf.mat2 from it, so that refit was dead code -- conf.mat2
# still comes from the previous section; the redundant refit is dropped.
compute.eval.measures(conf.mat2)
# --- Second pass: rebuild the training data for the TF-IDF experiment ------
# NOTE: the original history repeated the library/source calls and the whole
# data-frame construction twice back to back; a single copy is kept.
library(tm)
library(caret)
source("text_mining_utils.R")

spam.train.files <- list.files(path = "data/emails/spam-train", full.names = TRUE)
#spam.train.files[1:10]
nonspam.train.files <- list.files(path = "data/emails/nonspam-train", full.names = TRUE)
# nonspam.train.files[1:10]

# One row per training e-mail: path, class label, text (filled in below).
train.data <- data.frame(fpath = c(spam.train.files, nonspam.train.files),
                         label = c(rep("spam", times = length(spam.train.files)),
                                   rep("nonspam", times = length(nonspam.train.files))),
                         text = NA, stringsAsFactors = FALSE)
str(train.data)

# seq_len() avoids the 1:nrow() trap when a folder is empty.
for (i in seq_len(nrow(train.data))) {
  train.data$text[i] <- read.text(train.data$fpath[i])
}
head(train.data)

train.data$label <- as.factor(train.data$label)
summary(train.data$label)
# --- Test data and the combined TF-IDF document-term matrix ----------------
spam.test.files <- list.files(path = "data/emails/spam-test", full.names = TRUE)
nonspam.test.files <- list.files(path = "data/emails/nonspam-test", full.names = TRUE)
test.data <- data.frame(fpath = c(spam.test.files, nonspam.test.files),
                        label = c(rep("spam", times = length(spam.test.files)),
                                  rep("nonspam", times = length(nonspam.test.files))),
                        text = NA, stringsAsFactors = FALSE)
for (i in seq_len(nrow(test.data))) {
  test.data$text[i] <- read.text(test.data$fpath[i])
}
test.data$label <- as.factor(test.data$label)
head(test.data)

all.data <- rbind(train.data, test.data)
dim(all.data)

corpus <- Corpus(VectorSource(all.data$text))
# This time terms are weighted by TF-IDF instead of raw term frequency.
dtm <- DocumentTermMatrix(x = corpus,
                          control = list(wordLengths = c(2, Inf),
                                         weighting = weightTfIdf))
inspect(dtm)
dtm.reduced <- removeSparseTerms(dtm, sparse = 0.975)
dtm.reduced
dtm.final <- as.matrix(dtm.reduced)
dim(dtm.final)

# Split into train / test feature matrices.
# NOTE: the original history first created 'training.set'/'test.set' and
# immediately remove()d them before recreating the same matrices under the
# '.dtm' names; that dead create-and-delete pair is dropped.
training.dtm <- dtm.final[1:nrow(train.data), ]
test.dtm <- dtm.final[(nrow(train.data)+1):nrow(dtm.final), ]
# --- kNN (k = 5) on the TF-IDF features, seed 2712 -------------------------
library(class) # for kNN classifier
# NOTE: the original history ran the identical knn() fit twice in a row,
# once before and once after loading the 'class' package; one run is kept,
# with the library() call moved first.
k <- 5
train.labels <- train.data$label
set.seed(2712)
knn.fit <- knn(train = training.dtm,
               test = test.dtm,
               cl = train.labels,
               k = k)
test.labels <- test.data$label
conf.mat <- table(Predictions = knn.fit,
                  Actual = test.labels)
conf.mat
# --- Re-run with seed 2612 (comparable with the TF experiment) and tune k --
k <- 5
train.labels <- train.data$label
set.seed(2612)
knn.fit <- knn(train = training.dtm,
               test = test.dtm,
               cl = train.labels,
               k = k)
test.labels <- test.data$label
conf.mat <- table(Predictions = knn.fit,
                  Actual = test.labels)
conf.mat
compute.eval.measures(conf.mat)

# 10-fold CV over odd k values from 3 to 25 on the TF-IDF training matrix.
numFolds <- trainControl(method = "cv", number = 10)
kGrid <- expand.grid(.k = seq(from = 3, to = 25, by = 2))
# NOTE: the original history first called train() on the stale TF-weighted
# 'training' matrix instead of 'training.dtm', then repeated the corrected
# call twice; a single corrected call is kept.
set.seed(2612)
knn.cv <- train(x = training.dtm,
                y = train.labels,
                method = "knn",
                trControl = numFolds,
                tuneGrid = kGrid)
knn.cv
plot(knn.cv)

# Refit with the CV-selected k = 13 and compare against the k = 5 model.
knn.fit2 <- knn(train = training.dtm,
                test = test.dtm,
                cl = train.labels,
                k = 13)
conf.mat2 <- table(Predicted = knn.fit2, Actual = test.labels)
conf.mat2

knn1.eval <- compute.eval.measures(conf.mat)
knn1.eval
knn2.eval <- compute.eval.measures(conf.mat2)
knn2.eval
# Side-by-side comparison of the two models' evaluation measures.
data.frame(rbind(knn1.eval, knn2.eval), row.names = c("k=5", "k=13"))
#############################################################################
## FINAL CONSOLIDATED PIPELINE:                                            ##
## spam e-mail classification with TF-IDF features and tuned kNN           ##
#############################################################################
library(tm)
library(caret)
library(class) # for kNN classifier
source("text_mining_utils.R")

# -- Load training e-mails --------------------------------------------------
spam.train.files <- list.files(path = "data/emails/spam-train", full.names = TRUE)
#spam.train.files[1:10]
nonspam.train.files <- list.files(path = "data/emails/nonspam-train", full.names = TRUE)
# nonspam.train.files[1:10]
train.data <- data.frame(fpath = c(spam.train.files, nonspam.train.files),
                         label = c(rep("spam", times = length(spam.train.files)),
                                   rep("nonspam", times = length(nonspam.train.files))),
                         text = NA, stringsAsFactors = FALSE)
str(train.data)
# seq_len() avoids the 1:nrow() trap when a folder is empty
for (i in seq_len(nrow(train.data))) {
  train.data$text[i] <- read.text(train.data$fpath[i])
}
# head(train.data)
train.data$label <- as.factor(train.data$label)
summary(train.data$label)

# -- Load test e-mails ------------------------------------------------------
spam.test.files <- list.files(path = "data/emails/spam-test", full.names = TRUE)
nonspam.test.files <- list.files(path = "data/emails/nonspam-test", full.names = TRUE)
test.data <- data.frame(fpath = c(spam.test.files, nonspam.test.files),
                        label = c(rep("spam", times = length(spam.test.files)),
                                  rep("nonspam", times = length(nonspam.test.files))),
                        text = NA, stringsAsFactors = FALSE)
for (i in seq_len(nrow(test.data))) {
  test.data$text[i] <- read.text(test.data$fpath[i])
}
test.data$label <- as.factor(test.data$label)
# head(test.data)

# -- Build the TF-IDF document-term matrix over train + test ---------------
all.data <- rbind(train.data, test.data)
dim(all.data)
corpus <- Corpus(VectorSource(all.data$text))
dtm <- DocumentTermMatrix(x = corpus,
                          control = list(wordLengths = c(2, Inf),
                                         weighting = weightTfIdf))
inspect(dtm)
dtm.reduced <- removeSparseTerms(dtm, sparse = 0.975)
dtm.reduced
dtm.final <- as.matrix(dtm.reduced)
dim(dtm.final)
training.dtm <- dtm.final[1:nrow(train.data), ]
test.dtm <- dtm.final[(nrow(train.data)+1):nrow(dtm.final), ]

# -- Baseline kNN with k = 5 ------------------------------------------------
k <- 5
train.labels <- train.data$label
set.seed(2612) # knn() breaks ties randomly; fix the seed
knn.fit <- knn(train = training.dtm,
               test = test.dtm,
               cl = train.labels,
               k = k)
test.labels <- test.data$label
conf.mat <- table(Predictions = knn.fit,
                  Actual = test.labels)
conf.mat
knn1.eval <- compute.eval.measures(conf.mat)
knn1.eval

# -- Tune k with 10-fold cross-validation ----------------------------------
numFolds <- trainControl(method = "cv", number = 10)
kGrid <- expand.grid(.k = seq(from = 3, to = 25, by = 2))
set.seed(2612)
knn.cv <- train(x = training.dtm,
                y = train.labels,
                method = "knn",
                trControl = numFolds,
                tuneGrid = kGrid)
knn.cv
plot(knn.cv)

# -- Refit with the CV-selected k = 13 and compare the two models ----------
knn.fit2 <- knn(train = training.dtm,
                test = test.dtm,
                cl = train.labels,
                k = 13)
conf.mat2 <- table(Predicted = knn.fit2, Actual = test.labels)
conf.mat2
knn2.eval <- compute.eval.measures(conf.mat2)
knn2.eval
data.frame(rbind(knn1.eval, knn2.eval), row.names = c("k=5", "k=13"))
#############################################################################
# ACKNOWLEDGEMENTS
# this example is partially based on the Chapter 10 of the R and Data Mining
# http://www.rdatamining.com/books/rdm
#############################################################################
####################
## GET THE TWEETS ##
####################
# load the tweets data; column 1 = tweet text (character),
# column 2 = sentiment label (factor)
tweets.data <- read.csv(file = "data/tweets_sentiment_labelled.csv",
colClasses = c("character", "factor"))
str(tweets.data)
# examine a few positive and a few negative tweets
# (labels are "POS" / "NEG" in the 'Lbl' column)
tweets.data$Tweet[tweets.data$Lbl=="POS"][1:10]
tweets.data$Tweet[tweets.data$Lbl=="NEG"][1:10]
# examine the distribution of class labels
table(tweets.data$Lbl)
###################################
## PREPROCESSING OF TWEETS' TEXT ##
###################################
# the tm package is required for text processing
# install.packages("tm")
library(tm)
# load the R script with auxiliary functions
# (provides print.tweets() and compute.eval.measures(), among others)
source("text_mining_utils.R")
# build a corpus, one document per tweet
tw.corpus <- Corpus(VectorSource(tweets.data$Tweet))
# examine what a corpus item looks like
str(tw.corpus[[1]])
# tm_map() f. (from the tm package) allows for performing different transformations on the corpus;
# list of frequently used transformations can be obtained with getTransformations() f.
getTransformations()
# the purpose of all the transformations is to reduce the diversity among the words
# and remove words that are of low importance
# --- Tweet text normalisation ----------------------------------------------
# the first transformation: convert text to lower case
tw.corpus <- tm_map(tw.corpus, tolower)
print.tweets(tw.corpus, 1, 20)
# this corpus has many meaningful user references (mostly Twitter accounts
# of tech companies), so only the '@' sign is stripped (@username -> username)
replaceUserRefs <- function(x) gsub("@(\\w+)", "\\1", x)
tw.corpus <- tm_map(tw.corpus, replaceUserRefs)
print.tweets( tw.corpus, from = 20, to = 60 )
# remove hash (#) sign from hashtags
removeHash <- function(x) gsub("#([[:alnum:]]+)", "\\1", x)
tw.corpus <- tm_map(tw.corpus, removeHash)
print.tweets( tw.corpus, from = 20, to = 60 )
# replace URLs with the "URL" term
replaceURL <- function(x) gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "URL", x)
tw.corpus <- tm_map(tw.corpus, replaceURL)
print.tweets( tw.corpus, from = 20, to = 60 )
# replace links to pictures (e.g. pic.twitter.com/lbu9diufrf) with TW_PIC
replaceTWPic <- function(x) gsub("pic\\.twitter\\.com/[[:alnum:]]+",
                                 "TW_PIC", x)
tw.corpus <- tm_map(tw.corpus, replaceTWPic)
print.tweets( tw.corpus, from = 20, to = 60 )
# replace :-), :), ;-), :D, :o with the "POS_SMILEY" term.
# BUGFIX: inside a character class '|' is a literal pipe, not alternation,
# so the original classes "[:|;]" and "[\\)|o|O|D]" also matched stray '|'
# characters; the corrected classes list only the intended characters.
replaceHappySmiley <- function(x) gsub("[:;](-?)[)oOD]",
                                       "POS_SMILEY", x)
tw.corpus <- tm_map(tw.corpus, replaceHappySmiley)
print.tweets( tw.corpus, from = 20, to = 60 )
# replace :(, :-(, :/, >:(, >:O with the "NEG_SMILEY" term
# (same pipe-in-character-class fix as above)
replaceSadSmiley <- function(x) gsub("((>?):(-?)[(/Oo])",
                                     "NEG_SMILEY", x)
tw.corpus <- tm_map(tw.corpus, replaceSadSmiley)
# remove stopwords from corpus;
# first examine the tm's set of stopwords
stopwords('english')[100:120]
# add a few extra ('corpus-specific') stop words (e.g. "apple", "rt")
# to the 'general' stopwords for the English language
tw.stopwords <- c(stopwords('english'), "apple", "rt")
tw.corpus <- tm_map(tw.corpus, removeWords, tw.stopwords)
print.tweets( tw.corpus, from = 30, to = 70 )
# remove punctuation, but keep intra-word contractions and dashes
tw.corpus <- tm_map(tw.corpus, removePunctuation,
                    preserve_intra_word_contractions = TRUE,
                    preserve_intra_word_dashes = TRUE)
print.tweets( tw.corpus, from = 30, to = 70 )
# remove stand-alone numbers (but not numbers in e.g. iphone7 or g3);
# NOTE(review): the pattern requires surrounding spaces, so numbers at the
# very start/end of a tweet are not removed -- confirm this is acceptable
removeStandAloneNumbers <- function(x) gsub(" \\d+ ", "", x)
tw.corpus <- tm_map(tw.corpus, removeStandAloneNumbers)
print.tweets( tw.corpus, from = 30, to = 70 )
# strip whitespace
tw.corpus <- tm_map(tw.corpus, stripWhitespace)
# do word stemming using the Snowball stemmer: http://snowball.tartarus.org/
# the SnowballC package provides the stemmer in R
#install.packages("SnowballC")
library(SnowballC)
# keep a copy of the corpus before stemming, in case the 'regular'
# word forms are needed later
tw.corpus.backup <- tw.corpus
tw.corpus <- tm_map(tw.corpus, stemDocument, language = "english")
print.tweets( tw.corpus, from = 30, to = 70)
#####################################
## BUILDING A DOCUMENT-TERM MATRIX ##
#####################################
# Document Term Matrix (DTM) represents the relationship between terms and documents,
# where each row stands for a document and each column for a term, and an entry is the
# weight of the term in the corresponding document
# require each term to appear in at least ~0.5% and at most 95% of the tweets
min.freq <- round(0.005*length(tw.corpus))
max.freq <- round(0.95*length(tw.corpus))
dtm <- DocumentTermMatrix(tw.corpus,
control = list(bounds = list(global = c(min.freq,max.freq)),
wordLengths = c(2,16), # the restriction on the word length
weighting = weightTf)) # term freq. weighting scheme
# Note: the 'global' parameter is altered to require a word to appear in at least ~0.5%
# and at most in 95% of tweets to be included in the matrix;
# check the documentation of the TermDocumentMatrix() f. for other useful control parameters
# examine the built DTM matrix
inspect(dtm)
# we have very sparse DTM matrix; so, we should better reduce the sparsity
# by removing overly sparse terms:
dtm.trimmed <- removeSparseTerms(dtm, sparse = 0.9875)
inspect(dtm.trimmed)
# examine the resulting DTM matrix:
# check the terms that appear at least 20 times in the whole corpus
findFreqTerms(dtm.trimmed, lowfreq = 20)
# we can also inspect the frequency of accurance of all the terms
# (note: computed on the full 'dtm', before sparse terms were removed)
head(colSums(as.matrix(dtm)))
# better if they are sorted
sort(colSums(as.matrix(dtm)), decreasing = T)
#################################################
## CLASSIFYING TWEETS USING NAIVE BAYES METHOD ##
#################################################
# Turn the DTM into a data frame that classifier-building functions accept.
features.final <- as.data.frame(as.matrix(dtm.trimmed))
str(features.final)
# add the class label
features.final$CLASS_LBL <- tweets.data$Lbl
colnames(features.final)

# Stratified 85/15 train/test split.
library(caret)
set.seed(1212)
train.indices <- createDataPartition(y = features.final$CLASS_LBL,
                                     p = 0.85,
                                     list = FALSE)
train.data <- features.final[train.indices, ]
test.data <- features.final[-train.indices, ]

# Build the NB classifier using all features; Laplace smoothing avoids
# zero conditional probabilities for the many zero-valued word features.
#install.packages('e1071')
library(e1071)
nb1 <- naiveBayes(CLASS_LBL ~ .,
                  data = train.data,
                  laplace = 1)
# make predictions and evaluate with the default 0.5 threshold
nb1.pred <- predict(nb1, newdata = test.data, type = "class")
cm1 <- table(true = test.data$CLASS_LBL, predicted = nb1.pred)
cm1
eval1 <- compute.eval.measures(cm1)
eval1

# --- ROC analysis to pick a better probability threshold -------------------
#install.packages('pROC')
library(pROC)
nb1.pred.prob <- predict(nb1, newdata = test.data, type = "raw")
nb1.pred.prob[1:10, ]
nb.roc <- roc(response = as.numeric(test.data$CLASS_LBL),
              predictor = nb1.pred.prob[, 1], # probabilities of the 'positive' class
              levels = c(2, 1)) # order of levels: negative (controls),
                                # then positive (cases)
plot.roc(x = nb.roc,
         print.auc = TRUE) # print the AUC measure on the plot

# Candidate thresholds at the local maxima of the ROC curve.
# NOTE: the original history computed these coordinates twice in a row;
# a single computation is kept.
nb2.coords <- coords(roc = nb.roc,
                     x = "local maximas",
                     ret = c("accuracy", "sensitivity", "specificity", "thr"))
nb2.coords

# Pick the threshold from the coords result.
# NOTE(review): the hard-coded [4, 9] index (apparently the 'thr' row of the
# 9th local maximum) depends on the ROC shape of this particular run --
# confirm against the printed nb2.coords before reuse.
opt.threshold <- nb2.coords[4, 9]
nb1.pred.opt <- ifelse(test = nb1.pred.prob[, 1] > opt.threshold,
                       yes = "NEG", no = "POS")
nb1.pred.opt <- as.factor(nb1.pred.opt)
cm.opt <- table(actual = test.data$CLASS_LBL, predicted = nb1.pred.opt)
cm.opt
# examine evaluation measures for the tuned threshold
eval2 <- compute.eval.measures(cm.opt)
eval2
# compare evaluation measures (typo "theshold" in the row name fixed)
data.frame(rbind(eval1, eval2), row.names = c("default_threshold", "ROC_based_threshold"))
